require(hierfstat)
require(PopGenReport)
require(poppr)
require(genepop)
require(graph4lg)
require(related)
require(adegenet)
require(knitr)
require(tidyverse)
require(magrittr)

Readme

This is document is an R notebook. If you’d like view to pre-rendered figures, read a summary of analysis and interact with code, please open the relevant html file in a browser. Alternatively the notebook is published at https://rpubs.com/david_dayan/coquille_2021

To conduct a similar analyses on your computer, edit or run code: clone this repository into a directory on your local machine and open the .Rproj file in Rstudio. Files are stored on the github repository here: https://github.com/david-dayan/coquille_river_2021

Rationale / Brief Methods

The goal of this notebook is to assess if any genetic structure/differentiation can be observed between hatchery and wild stocks of North and South Fork Coquille River O. mykiss using a genetic panel of 391 SNPs.

We conduct several brief analyses:

  • PCA
  • DAPC
  • Estimate Fst
  • Summarize Variation at Known Run-Timing Markers

Data Summary

91 individuals were genotyped at 391 GTseq genetic markers including presumably neutral and putatively adaptive genetic markers. Sample sizes and summary metadata for the the unfiltered data is below. All samples are described at winter steelhead

Unfiltered Sample Metadata

sheet1 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 1)
sheet2 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 2)

meta_data <- sheet1 %>%
  bind_rows(sheet2) %>%
  select(-1) %>%
  mutate(pop = str_sub(`SFGL Id`, 8, 11))

kable(meta_data %>%
  group_by(pop, `Hat/Wild`) %>%
  summarise(n = n()) )
pop Hat/Wild n
NFCQ Hatchery 15
NFCQ Wild 26
SFCQ Hatchery 22
SFCQ Wild 28
kable(meta_data %>%
  group_by(pop, Date) %>%
  summarise(n = n()) )
pop Date n
NFCQ 2016-03-30 15
NFCQ 2016-07-11 10
NFCQ 2016-07-13 16
SFCQ 2016-02-17 7
SFCQ 2016-03-01 10
SFCQ 2016-03-30 12
SFCQ 2016-07-12 21
kable(meta_data %>%
  group_by(pop, `Adult/Juv`) %>%
  summarise(n = n()) )
pop Adult/Juv n
NFCQ Juvenile 41
SFCQ Adult 17
SFCQ Juvenile 33
rm(sheet1)
rm(sheet2)

Filtered Dataset
After filtering the GTseq dataset for genotype quality 71 individuals genotyped at 347 markers remained. Full details of the genotype calling and filtering is available in the notebook here. This information is also available in the project github repository. Summary information is below.

load("genotype_data/genind_2.0.R")
load("genotype_data/genotypes_2.2.R")

genos_2.2 %<>%
  left_join(select(meta_data, `Adult/Juv`, `SFGL Id`), by = c("sample" = "SFGL Id")) %>%
  relocate(sample, `Adult/Juv`)

kable(genos_2.2 %>%
  group_by(pop, `Hat/Wild`) %>%
  summarise(n = n()) )
pop Hat/Wild n
NFCQ Hatchery 13
NFCQ Wild 21
SFCQ Hatchery 19
SFCQ Wild 18
kable(genos_2.2 %>%
  group_by(pop, Date) %>%
  summarise(n = n()) )
pop Date n
NFCQ 2016-03-30 13
NFCQ 2016-07-11 9
NFCQ 2016-07-13 12
SFCQ 2016-02-17 5
SFCQ 2016-03-01 7
SFCQ 2016-03-30 11
SFCQ 2016-07-12 14
kable(genos_2.2 %>%
  group_by(pop, `Adult/Juv`) %>%
  summarise(n = n()) )
pop Adult/Juv n
NFCQ Juvenile 34
SFCQ Adult 12
SFCQ Juvenile 25

Failed samples
An unusually large number of individuals failed genotyping. Metadata from these failed individuals is below. Most (15 of 20) were filtered because of very low on-target read depth. Three additional individuals were filtered due to moderately poor read depth/genotyping success rate and two were removed because of possible contamination. Filtered individuals were not enriched for a particular metadata variable (e.g. all of the filtered individuals were not adult hatchery samples or juvenile NFCQ YoY). Further details also available in genotyping log.

kable(meta_data %>%
  filter(!(`SFGL Id` %in% genos_2.2$sample)) %>%
  select(-c(Pedigree, `Vial #`)))
Date Hat/Wild Adult/Juv Sex Est. Age SFGL Id pop
2016-03-30 Hatchery Juvenile NA 1 OmyJC16NFCQ_0042 NFCQ
2016-03-30 Hatchery Juvenile NA 1 OmyJC16NFCQ_0011 NFCQ
2016-07-11 Wild Juvenile NA 1+ OmyJC16NFCQ_0044 NFCQ
2016-07-13 Wild Juvenile NA YoY OmyJC16NFCQ_0037 NFCQ
2016-07-13 Wild Juvenile NA YoY OmyJC16NFCQ_0018 NFCQ
2016-07-13 Wild Juvenile NA YoY OmyJC16NFCQ_0033 NFCQ
2016-07-13 Wild Juvenile NA 1+ OmyJC16NFCQ_0008 NFCQ
2016-02-17 Wild Adult M NA OmyAC16SFCQ_0005 SFCQ
2016-02-17 Wild Adult M NA OmyAC16SFCQ_0006 SFCQ
2016-03-01 Wild Adult M NA OmyAC16SFCQ_0009 SFCQ
2016-03-01 Hatchery Adult M NA OmyAC16SFCQ_0016 SFCQ
2016-03-01 Hatchery Adult M NA OmyAC16SFCQ_0017 SFCQ
2016-03-30 Hatchery Juvenile NA 1 OmyJC16SFCQ_0024 SFCQ
2016-07-12 Wild Juvenile NA 1+ OmyJC16SFCQ_0027 SFCQ
2016-07-12 Wild Juvenile NA YoY OmyJC16SFCQ_0043 SFCQ
2016-07-12 Wild Juvenile NA 1+ OmyJC16SFCQ_0038 SFCQ
2016-07-12 Wild Juvenile NA YoY OmyJC16SFCQ_0034 SFCQ
2016-07-12 Wild Juvenile NA 1+ OmyJC16SFCQ_0026 SFCQ
2016-07-12 Wild Juvenile NA YoY OmyJC16SFCQ_0045 SFCQ
2016-07-12 Wild Juvenile NA YoY OmyJC16SFCQ_0041 SFCQ

PCA

Our first examination of potential genetic structure among the individuals is a principal component analysis.

The first step is to assess the number of PCs to retain in the analysis. We do this using the Kaiser Guttman criterion (below) and a broken stick model (below)

# set missing data to mean allele freq (PCA does not accomodate NAs)
X <- scaleGen(genind_2.0,  NA.method="mean")


#then run pca, keep all PCs
pca1 <- dudi.pca(X, scale = FALSE, scannf = FALSE, nf = 71)

### check pcs to keep with kaiser-guttman and broken stick

#kaiser guttman
cutoff<-mean(pca1$eig)
kg <- length((pca1$eig)[(pca1$eig)>cutoff])
barplot(pca1$eig, main = "PCA eigenvalues\nKaiser-Guttman Criteria (red line)")
abline(h = cutoff, col = "red")

#broken stick
n <- length(pca1$eig)
bsm <- data.frame(j=seq(1:n), p = 0)
bsm$p[1] <- 1/n
for (i in 2:n){
  bsm$p[i] <- bsm$p[i-1]+(1/(n+1-i))
  
}
bsm$p <- 100*bsm$p/n

pca_eigs_to_plot <- as.data.frame(cbind(100*pca1$eig/sum(pca1$eig)), rev(bsm$p))
pca_eigs_to_plot %<>%
  rownames_to_column(var = "bsm") %>%
  rename(pca_eig_perc = V1) %>%
  mutate(pca_eig_perc = as.numeric(pca_eig_perc))

pca_eigs_to_plot %<>%
  rowid_to_column("row_n") %>%
  mutate(bsm = as.numeric(bsm)) %>%
  pivot_longer(!row_n, names_to = "bsm_or_eig", values_to = "percent_variance")

ggplot(data = pca_eigs_to_plot[1:71,])+geom_bar(aes(x = as.factor(row_n), y = percent_variance, color = bsm_or_eig, fill = bsm_or_eig), stat = "identity", position=position_dodge())+theme_classic()+xlab("Eigenvector")+ylab("percent of variance")+ggtitle("Broken Stick Model")

The Kaiser-Guttman criterion (liberal) suggests retaining variation at the first 30 PCs, while the broken stick model (conservative) suggests no PCs are relevant.

PCA Results

We plot the first 4 PC axes below according to population (North vs South Fork) and origin (hatchery vs. natural origin/wild) and their interaction.

North vs South Fork

#kept all PCs
snp_pcs <- pca1$li#[,c(1:kg)]

#now plot data
snp_pcs %<>%
  rownames_to_column("sample") %>%
  left_join(select(genos_2.2, sample, pop, `Hat/Wild`))

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = pop)) + stat_ellipse(aes(Axis1, Axis2, color = pop)) +theme_classic()+scale_color_viridis_d(name = "North or South Fork")

ggplot(data = snp_pcs)+geom_point(aes(Axis3, Axis4, color = pop)) + stat_ellipse(aes(Axis1, Axis2, color = pop)) +theme_classic()+scale_color_viridis_d(name = "North or South Fork")

#3d plot as well
plotly::plot_ly(x=snp_pcs$Axis1, y=snp_pcs$Axis2, z=snp_pcs$Axis3, type="scatter3d", mode="markers", color=snp_pcs$pop, alpha = 0.8)

There is no apparent genetic structure in principal component space between North and South Fork samples.

HOR vs NOR
Now let’s compare all wild (NOR) to all hatchery origin (HOR) individuals.

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = `Hat/Wild`)) + stat_ellipse(aes(Axis1, Axis2, color = `Hat/Wild`)) +theme_classic()+scale_color_viridis_d(name = "HOR or NOR", begin = 0.2, end = 0.8)

ggplot(data = snp_pcs)+geom_point(aes(Axis3, Axis4, color = `Hat/Wild`)) + stat_ellipse(aes(Axis1, Axis2, color =`Hat/Wild`)) +theme_classic()+scale_color_viridis_d(name = "HOR or NOR", begin = 0.2, end = 0.8)

#3d plot as well
plotly::plot_ly(x=snp_pcs$Axis1, y=snp_pcs$Axis2, z=snp_pcs$Axis3, type="scatter3d", mode="markers", color=snp_pcs$`Hat/Wild`, alpha = 0.8, colors = viridis::viridis(2, begin = 0.2, end = 0.8))

Also no apparent structure in PC space between hatchery and wild.

Full Interaction (Origin * Fork)

Let’s also examine each stock separately. This will allow us to examine potential structure between North Fork Hatchery and South Fork wild stocks, for example

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = interaction(`Hat/Wild`,pop))) + stat_ellipse(aes(Axis1, Axis2, color = interaction(`Hat/Wild`,pop))) +theme_classic()+scale_color_viridis_d(name = "HOR or NOR\nand North or South Fork")

ggplot(data = snp_pcs)+geom_point(aes(Axis3, Axis4, color = interaction(`Hat/Wild`,pop))) + stat_ellipse(aes(Axis3, Axis4, color = interaction(`Hat/Wild`,pop))) +theme_classic()+scale_color_viridis_d(name = "HOR or NOR\nand North or South Fork")

#3d plot as well
plotly::plot_ly(x=snp_pcs$Axis1, y=snp_pcs$Axis2, z=snp_pcs$Axis3, type="scatter3d", mode="markers", color=interaction(snp_pcs$`Hat/Wild`, snp_pcs$pop), alpha = 0.8, colors = viridis::viridis(4, begin = 0, end = 1))

Substantial overlap in PC space for this comparison as well. However, there may be a very subtle difference in group centroids between hatchery North Fork samples and all others. Examination with DAPC and FST should better characterize the scale of this potential difference.

DAPC

PCA will fail to find structure when both FST and the number of markers is low. Next we will attempt find a combination of alleles in the dataset that maximizes differences between groups of individuals using discriminant analysis of principal components (DAPC).

North vs South Fork

First, we’ll combine hatchery and wild samples within each fork to look for structure within the basin. This assumes HOR/NOR differences are limited within a fork (we’ll investigate this second possibility later).

Set Up

First we need to assess the correct number of PCs to retain in the DAPC, including too many can lead to overfitting and observation of among-group differences that are unlikely to be biologically meaningful. Below we use cross validation and the a.score approach to find the optimum number of PCs while avoiding overfitting.

# run this interactively to find the optimum number of PCs without overfitting

# first fit a DAPC and create the other dataframe needed to run a cross validation
dapc_full <- dapc(genind_2.0, n.pca = 71, n.da = 1)

mat <- as.matrix(scaleGen(genind_2.0, NA.method="mean", scale=FALSE, center=FALSE))
xpop <- pop(genind_2.0)
xval <- xvalDapc(mat, xpop, n.pca.max = 71, training.set = 0.9, result = "overall", center = TRUE, scale = FALSE, n.pca = seq(1,71, length.out = 71), n.rep = 500, xval.plot = TRUE)

# 22 was the best number of PCs achieving the lowest MSE / highest proportion of successful assignment 



# we will use the lowest value in the final DAPC to avoid overfitting, here 17 pcs

Cross validation using 500 replicates of 90%:10% training:test datasets found 19 PCs to achieve the lowest mean square error rate (28.8%) and highest correct assignment rate, successfully assigning 75.6% of samples in the test set to the correct source population. The median, 2.5% and 97.5% confidence intervals for random permutation of the data was 49%, 41% and 60% respectively, suggesting that this assignment power is much greater than expected by chance alone.

Results

Below we show the results of the DAPC with 19 PCs

DAPC Figure

dapc_19 <- dapc(genind_2.0, n.pca = 19, n.da =1)


plot_data <- as.data.frame(dapc_19$ind.coord)
plot_data$pop <- as.character(genind_2.0$pop)

ggplot(data=plot_data)+geom_density(aes(x=LD1, color = pop, fill = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North or South Fork")+scale_fill_viridis_d(name = "North or South Fork")

The density plot above demonstrates that using 19 principal components derived from 347 genetic markers we can identify subtle structure between North and South Fork samples. Note that while there is a difference in the mean value of the discriminant axis for each population, there is substantial overlap. This incomplete discrimination suggests subtle structure.

Variable Loadings
Which markers contribute to this structure and are they enriched for a particular annotation?

The plot and table below shows variable loadings (contribution to the first discriminant axis for each allele).

#get variable loadings
marker_loadings1 <- loadingplot(dapc_19$var.contr, axis=1,thres=.006, lab.jitter=1, main = "loading plot for DA 1", )

markers1 <- unique(substr(names(marker_loadings1$var.values),1,nchar(names(marker_loadings1$var.values))-2))

#get marker annotations

marker_mapping <- readxl::read_xlsx("metadata/final_mapping_results.xlsx", sheet = 1)

marker_mapping %<>%
  mutate(marker = str_replace(marker, "Omy(\\d+)", "Chr\\1")) %>% #marker name convention is different
  mutate(marker = str_replace(marker, "\\.", "_")) %>% 
  mutate(neutral = if_else(str_detect(`Presumed Type`, 'Adaptive|adaptive'), "adaptive", "neutral"))

mls <- as.data.frame(marker_loadings1$var.values) %>%
  rownames_to_column(var = "marker") %>%
  mutate(marker = str_sub(marker, 0, nchar(marker) -2)) %>%
  distinct(marker, .keep_all= TRUE) %>%
  rename("loading_value" = "marker_loadings1$var.values") %>%
  mutate(loading_value = loading_value*2) %>%
  left_join(select(marker_mapping, marker, `Presumed Type`, chr, start)) %>%
  arrange(desc(loading_value)) %>%
  rename("Chromosome" = "chr", "Marker Position" = "start", "Annotation" = "Presumed Type", "Marker" = "marker", "Variable Loading" = "loading_value")
kable(mls)
Marker Variable Loading Annotation Chromosome Marker Position
Omy_aromat-280 0.0290176 Neutral 4 3391425
Omy_RAD24287-74 0.0287950 Adaptive. Basin-wide, top-outlier 8 28035667
Omy_star-206 0.0273631 Neutral 6 36624834
OMS00003 0.0241767 Neutral 1 59464291
Omy_96222-125 0.0228843 Neutral 15 24041060
Omy_RAD62596-38 0.0216128 Neutral 7 49977729
Omy_128996-481 0.0204080 Neutral 18 30802061
Omy_cd59-206 0.0201659 Neutral 26 8028280
Omy_RAD29700-18 0.0193766 Neutral 20 1673132
Omy_RAD7016-31 0.0192200 Adaptive. Basin-wide, top-outlier 6 51490729
Omy_b1-266 0.0176143 Neutral 21 41255723
OMS00179 0.0172761 Neutral 8 25539866
Omy_RAD49111-35 0.0172436 Adaptive. Basin-wide, top-outlier 19 40050059
OMS00057 0.0171366 Neutral 7 67908088
Omy_OmyP9-180 0.0168518 Neutral 29 15673363
Omy_117540-259 0.0166410 Neutral 12 5079321
Omy_RAD43694-41 0.0151115 Adaptive. Basin-wide, top-outlier 17 57728473
Omy_101832-195 0.0148230 Neutral 17 17015627
Omy_RAD24343-29 0.0145488 Adaptive. Mean air temperature (wettest quarter). Basin-wide, temperature-related 3 27928249
Omy_RAD59758-41 0.0126779 Adaptive. Minimum annual precipitation. Basin-wide, precipitation-related; 25 40219318
Omy_u09-52_284 0.0125023 Neutral 12 50716210
Omy_104519-624 0.0120619 Neutral 8 42764074

These results suggest that the subtle structure we observe among samples is driven a mix of neutral and putatively adaptive genetic markers spread throughout the genome. The markers above represent the top 22 markers that load onto this disciminant axis and explain 42% of the variation captured by it.

Full Interaction

Next we’ll do the full DAPC of population:origin (compare all four possible groupings - NOR North Fork, HOR North Fork, NOR South Fork, HOR South Fork)

Setup

#set new "populations" in new genind object
pops <- data.frame(row.names(genind_2.0$tab))
pops %<>%
  rename(sample="row.names.genind_2.0.tab.") %>%
  left_join(meta_data, by = c("sample" = "SFGL Id")) %>%
  mutate(int = interaction(`Hat/Wild`, pop))

genind_int <- genind_2.0
genind_int$pop <- pops$int

#    
# next do the xval and a.score procedure 
# run this interactively to find the optimum number of PCs without overfitting

# first fit a DAPC and create the other dataframe needed to run a cross validation
dapc_full <- dapc(genind_int, n.pca = 71, n.da = 3)

mat <- as.matrix(scaleGen(genind_int, NA.method="mean", scale=FALSE, center=FALSE))
xpop <- pop(genind_int)
xval <- xvalDapc(mat, xpop, n.pca.max = 71, training.set = 0.9, result = "overall", center = TRUE, scale = FALSE, n.pca = seq(1,71, length.out = 71), n.rep = 500, xval.plot = TRUE)

# 13 was the best number of PCs achieving the lowest MSE / highest proportion of successful assignment 


# we will use the lowest value in the final DAPC to avoid overfitting, here 12 pcs

Here, cross validation found that DAPC built on a training dataset could successfully assign a novel test sample back to one of four source populations 61% of the time when using the best number of PCs according to successful assignment rate or lowest mean square error, 22. The median, 2.5% and 97.5% confidence intervals for random permutation of the data was 25%, 16% and 36% respectively, suggesting that this assignment power is much greater than expected by chance alone.

Results

Below we show the results of the DAPC with 12 PCs

DAPC Figure

dapc_int <- dapc(genind_int, n.pca = 22, n.da =3)


plot_data <- as.data.frame(dapc_int$ind.coord)
plot_data$pop <- as.character(genind_int$pop)

ggplot(data=plot_data)+geom_point(aes(x=LD1, y = LD2, color = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North vs South Fork\nand HOR vs NOR")+stat_ellipse(aes(x=LD1, y = LD2, color = pop))

ggplot(data=plot_data)+geom_point(aes(x=LD1, y = LD3, color = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North vs South Fork\nand HOR vs NOR")+stat_ellipse(aes(x=LD1, y = LD3, color = pop))

ggplot(data=plot_data)+geom_density(aes(x=LD1, color = pop, fill = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North or South Fork and HOR vs NOR")+scale_fill_viridis_d(name = "North or South Fork and HOR vs NOR")

ggplot(data=plot_data)+geom_density(aes(x=LD2, color = pop, fill = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North or South Fork and HOR vs NOR")+scale_fill_viridis_d(name = "North or South Fork and HOR vs NOR")

ggplot(data=plot_data)+geom_density(aes(x=LD3, color = pop, fill = pop), alpha = 0.5)+theme_classic()+scale_color_viridis_d(name = "North or South Fork and HOR vs NOR")+scale_fill_viridis_d(name = "North or South Fork and HOR vs NOR")

temp <- summary(dapc_int)$assign.per.pop*100
par(mar=c(4.5,7.5,1,1))
barplot(temp, xlab="% of reassignment to group", horiz=TRUE, las=1)

The figures above demonstrate that there is structure among the four groups. 54% of variation in the dataset is constrained by the three discriminant axes with 51%, 34%, and 15% constrained by the first second and third discriminant axes (LDs).

The first discriminant axis (LD1) strongly separates the North Fork Hatchery stock from all other groups. The second discriminant axis (LD2) largely separates the South Fork Hatchery stock from all others groups, but discrimination along this axis is not complete suggesting more subtle structure. Finally LD3 separates Wild North and Wild South Fork samples, with the hatchery stocks intermediate, but again, the structure revealed by this genetic axis is very subtle.

Reassignment rates are somewhat inflated compared to cross-validated results, but this makes sense considering that the training dataset is larger when not leaving out individuals for the test dataset, and overfitting is possible when the full dataset is available to train the DAPC.

Variable Loadings
Which markers contribute to this structure and are they enriched for a particular annotation?

The plot and table below shows variable loadings for LD1 (contribution to the first discriminant axis for each allele).

#get variable loadings
marker_loadings1 <- loadingplot(dapc_int$var.contr, axis=1,thres=.007, lab.jitter=1, main = "loading plot for DA 1", )

markers1 <- unique(substr(names(marker_loadings1$var.values),1,nchar(names(marker_loadings1$var.values))-2))

#get marker annotations

marker_mapping <- readxl::read_xlsx("metadata/final_mapping_results.xlsx", sheet = 1)

marker_mapping %<>%
  mutate(marker = str_replace(marker, "Omy(\\d+)", "Chr\\1")) %>% #marker name convention is different
  mutate(marker = str_replace(marker, "\\.", "_")) %>% 
  mutate(neutral = if_else(str_detect(`Presumed Type`, 'Adaptive|adaptive'), "adaptive", "neutral"))

mls <- as.data.frame(marker_loadings1$var.values) %>%
  rownames_to_column(var = "marker") %>%
  mutate(marker = str_sub(marker, 0, nchar(marker) -2)) %>%
  distinct(marker, .keep_all= TRUE) %>%
  rename("loading_value" = "marker_loadings1$var.values") %>%
  mutate(loading_value = loading_value*2) %>%
  left_join(select(marker_mapping, marker, `Presumed Type`, chr, start)) %>%
  arrange(desc(loading_value)) %>%
  rename("Chromosome" = "chr", "Marker Position" = "start", "Annotation" = "Presumed Type", "Marker" = "marker", "Variable Loading" = "loading_value")
kable(mls, caption = "Discriminant Axis 1 Loadings")
Discriminant Axis 1 Loadings
Marker Variable Loading Annotation Chromosome Marker Position
Omy_aromat-280 0.0288300 Neutral 4 3391425
OMS00057 0.0254953 Neutral 7 67908088
OMS00103 0.0216795 Neutral 9 38335652
Omy_aldB-165 0.0210987 Neutral 25 21528133
Omy_RAD7210-8 0.0201542 Neutral 15 31707658
OMS00048 0.0197699 Neutral 23 37125601
Omy_RAD13034-67 0.0191904 Adaptive-Thermal Adaptation 26 12536995
OMS00003 0.0178864 Neutral 1 59464291
Omy_117540-259 0.0175760 Neutral 12 5079321
OMS00018 0.0164831 Neutral 16 46432395
Omy_111383-51 0.0162900 Neutral 15 21239707
Omy_RAD3209-10 0.0162187 Adaptive. Basin-wide, top-outlier 18 50984914
Omy_110689-148 0.0159564 Neutral 14 72355578
Omy_b1-266 0.0144412 Neutral 21 41255723
Omy_metA-161 0.0142776 Neutral 1 24257279
Omy_star-206 0.0142156 Neutral 6 36624834
Omy_ndk-152 0.0141369 Neutral 12 56277891
Omy_RAD5374-56 0.0140237 Adaptive. Minimum annual precipitation. Basin-wide, top-outlier 11 6186886
#get variable loadings
marker_loadings1 <- loadingplot(dapc_int$var.contr, axis=2,thres=.0065, lab.jitter=1, main = "loading plot for DA 2", )

markers1 <- unique(substr(names(marker_loadings1$var.values),1,nchar(names(marker_loadings1$var.values))-2))

#get marker annotations

marker_mapping <- readxl::read_xlsx("metadata/final_mapping_results.xlsx", sheet = 1)

marker_mapping %<>%
  mutate(marker = str_replace(marker, "Omy(\\d+)", "Chr\\1")) %>% #marker name convention is different
  mutate(marker = str_replace(marker, "\\.", "_")) %>% 
  mutate(neutral = if_else(str_detect(`Presumed Type`, 'Adaptive|adaptive'), "adaptive", "neutral"))

mls <- as.data.frame(marker_loadings1$var.values) %>%
  rownames_to_column(var = "marker") %>%
  mutate(marker = str_sub(marker, 0, nchar(marker) -2)) %>%
  distinct(marker, .keep_all= TRUE) %>%
  rename("loading_value" = "marker_loadings1$var.values") %>%
  mutate(loading_value = loading_value*2) %>%
  left_join(select(marker_mapping, marker, `Presumed Type`, chr, start)) %>%
  arrange(desc(loading_value)) %>%
  rename("Chromosome" = "chr", "Marker Position" = "start", "Annotation" = "Presumed Type", "Marker" = "marker", "Variable Loading" = "loading_value")
kable(mls, caption = "Discriminant Axis 2 Loadings")
Discriminant Axis 2 Loadings
Marker Variable Loading Annotation Chromosome Marker Position
Omy_96222-125 0.0288261 Neutral 15 24041060
Omy_RAD62596-38 0.0286550 Neutral 7 49977729
Omy_u09-52_284 0.0269002 Neutral 12 50716210
OMS00006 0.0262732 Neutral 16 63247901
Omy_104519-624 0.0242359 Neutral 8 42764074
Omy_RAD29700-18 0.0223229 Neutral 20 1673132
OMS00058 0.0186596 Neutral 22 19922108
OMS00064 0.0182929 Neutral 7 45227700
Omy_RAD7016-31 0.0181179 Adaptive. Basin-wide, top-outlier 6 51490729
Omy_star-206 0.0177269 Neutral 6 36624834
Omy_RAD65808-68 0.0149806 Adaptive. Mean precipitation (driest quarter). Basin-wide, top-outlier 1 12187627
Omy_RAD22123-69 0.0149345 Adaptive. Migration Distance to Ocean. Basin-wide, top-outlier 17 23756837
OMS00179 0.0149116 Neutral 8 25539866
Omy_sast-264 0.0148261 Neutral 18 28252045
Omy_aromat-280 0.0143678 Neutral 4 3391425
Omy_IL17-185 0.0139959 Neutral 22 27226007
Omy_mapK3-103 0.0134117 Neutral. Possible linkage 7 10975658

These marker loading results suggests the potential differences among the four groups are driven by 18 markers that collectively explain 33% of the variation along the first discriminant axis and 17 markers that collectively explain 33% of the variation along the second axis.

Markers with neutral and adaptive annotations drive both axes.

Genetic Differentiation

North vs South Fork

Next we will estimate the level of genetic differentiation between North and South Fork samples using FST.

Let’s calculate some basic F-statistics

fstat <- genind2hierfstat(genind_2.0)
colnames(fstat) <- c(pop, names(genind_2.0$loc.n.all))

basicstats <- basic.stats(fstat)
kable(basicstats$overall, caption = "Basic F-statistics of Dataset")
Basic F-statistics of Dataset
x
Ho 0.2954
Hs 0.3032
Ht 0.3041
Dst 0.0009
Htp 0.3050
Dstp 0.0017
Fst 0.0029
Fstp 0.0057
Fis 0.0257
Dest 0.0025

For FST let’s also estimate using the Weir and Cockerham method.

genet.dist(fstat, method="WC84")
##             NFCQ
## SFCQ 0.005731694

The estimated FST between North and South Fork samples is very small at 0.0057.

Full Interaction

Next we will estimate the level of genetic differentiation between all four groups using FST.

Let’s calculate some basic F-statistics

fstat <- genind2hierfstat(genind_int)
colnames(fstat) <- c(pop, names(genind_int$loc.n.all))

basicstats <- basic.stats(fstat)
kable(basicstats$overall, caption = "Basic F-statistics of Dataset")
Basic F-statistics of Dataset
x
Ho 0.2954
Hs 0.3013
Ht 0.3038
Dst 0.0025
Htp 0.3047
Dstp 0.0034
Fst 0.0083
Fstp 0.0111
Fis 0.0197
Dest 0.0048

For FST let’s also estimate using the Weir and Cockerham method.

genet.dist(fstat, method="WC84")
##               Hatchery.NFCQ   Wild.NFCQ Hatchery.SFCQ
## Wild.NFCQ       0.016006544                          
## Hatchery.SFCQ   0.015072079 0.011803694              
## Wild.SFCQ       0.015116550 0.003935760   0.004294291

The estimated maximum FST between any pair of samples is small at 0.016 and dataset wide FST is 0.0083 The maximum pairwise FST is observed between wild and hatchery North Fork samples. However, all comparisons that include the North Fork Hatchery sample are of similar magnitude and larger than other comparisons, corroborating the results from DAPC that this stock is the most different from any other.

The lowest pairwise FST is between wild North and South Fork samples.

Run-Timing Markers

We are always particularly interested in the patterns of diversity within genomic regions that have major impacts on ecologically relevant traits. Let’s examine the GREB1L/ROCK1 region associated with migration timing.

Allele Freqs
Below we make a heatmap of allele frequencies. Markers are arranged in genomic order.

run_timing_loci_names <- marker_mapping %>%
  filter(chr == "28" | CRITFC_chromosome == "28") %>%
  filter(str_detect(`Presumed Type`, 'run|Run')) %>%
  select(marker)

#different naming convention, lets fix
run_timing_loci_names <- str_replace(run_timing_loci_names$marker, "Omy28", "Chr28")

#
all_counts <- allele.dist(genind_int, mk.figures = FALSE)$count

#make into a dataframe
all_counts <- as.data.frame(do.call(rbind, all_counts))
colnames(all_counts) <- c("NF_Hatchery_count","NF_Wild_count", "SF_Hatchery_count", "SF_Wild_Count")
all_counts$sum <- rowSums(all_counts, na.rm = TRUE)

all_freqs <- allele.dist(genind_int, mk.figures = FALSE)$frequency
#make into a dataframe
all_freqs <- as.data.frame(do.call(rbind, all_freqs))

all_freqs <- as.data.frame(cbind(all_freqs, all_counts))

##### get only minor allele
all_freqs$marker <- genind_int$loc.fac

#now group by marker and keep the minor allele, then convert counts to 
all_maf <- all_freqs %>%
  group_by(marker) %>%
  slice_min(sum) %>%
  replace(., is.na(.), 0)

run_timing_maf <- all_maf %>%
  filter(marker %in% run_timing_loci_names) %>%
  left_join(select(marker_mapping, marker, CRITFC_SNP_pos_genome)) %>%
  arrange(CRITFC_SNP_pos_genome, .keep_all=TRUE)

#manually enter the position for one of these
run_timing_maf[12,11] <- "11702210"
run_timing_maf %<>%
  arrange(CRITFC_SNP_pos_genome, .keep_all=TRUE)

#definition of minor allele frequency broke for fixed marker, let's reset to 0
run_timing_maf[6,1:5] <- list(0,0,0,0,0)

tmat <- t(as.matrix(run_timing_maf[,1:4]))
colnames(tmat) <- run_timing_maf$marker
pheatmap::pheatmap(tmat, show_colnames  = T, cluster_cols = FALSE, main = "Minor Allele Frequency of Run-Timing Markers", cluster_rows = FALSE)

Generally, only subtle differences in allele frequency between North and South Fork steelhead samples at run timing markers. Interestingly, GREB1_05 demonstrates higher heterozygosity in BOTH hatchery stocks. We will discuss potential implications later.

Individual Genotypes
Sometimes it is useful to look at patterns among individuals instead of allele frequency.

Below we plot the genotypes of all samples at run-timing markers. Markers are arranged in genomic order. Rows (individuals) are hierarchically clustered within each population. Allele is displayed by color (overall minor allele homozygote = BLUE, heterozygote = yellow, major allele homozygote = red)

#use this to set minor allele
#colSums(genind_2.0[loc=run_timing_loci_names]$tab, na.rm = TRUE)

sep_genind28 <- seppop(genind_2.0[loc=run_timing_loci_names])

polarized_allele_counts <- as.data.frame(sep_genind28$NFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)]) %>%
  bind_rows(as.data.frame(sep_genind28$SFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)]))

colnames(polarized_allele_counts) <- str_sub(colnames(polarized_allele_counts), 1, nchar(colnames(polarized_allele_counts))-2)

polarized_allele_counts %<>%
  rownames_to_column(var = "sample") %>%
  mutate(pop = str_sub(sample, 8,11)) %>%
  relocate(run_timing_maf$marker) #reorder to reflect genome order
  

pac_nf <- polarized_allele_counts %>%
  filter(pop == "NFCQ")
pac_sf <- polarized_allele_counts %>%
  filter(pop == "SFCQ")
nf_heat <- pheatmap::pheatmap(pac_nf[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "North Fork Major Allele Count")

sf_heat <- pheatmap::pheatmap(pac_sf[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "South Fork Major Allele Count")

#use this to set minor allele
#colSums(genind_2.0[loc=run_timing_loci_names]$tab, na.rm = TRUE)

sep_genind28 <- seppop(genind_int[loc=run_timing_loci_names])

polarized_allele_counts <- as.data.frame(sep_genind28$Hatchery.NFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)]) %>%
  bind_rows(as.data.frame(sep_genind28$Wild.NFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)])) %>%
  bind_rows(as.data.frame(sep_genind28$Hatchery.SFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)])) %>%
  bind_rows(as.data.frame(sep_genind28$Wild.SFCQ$tab[,c(1,3,5,7,9,11,13,15,17,19,20,22)]))

colnames(polarized_allele_counts) <- str_sub(colnames(polarized_allele_counts), 1, nchar(colnames(polarized_allele_counts))-2)

polarized_allele_counts %<>%
  rownames_to_column(var = "sample") %>%
  left_join(select(pops, sample, int)) %>%
  relocate(run_timing_maf$marker) #reorder to reflect genome order
  

pac_Hatchery.NFCQ <- polarized_allele_counts %>%
  filter(int == "Hatchery.NFCQ")
pac_Wild.NFCQ <- polarized_allele_counts %>%
  filter(int == "Wild.NFCQ")

pac_Hatchery.SFCQ <- polarized_allele_counts %>%
  filter(int == "Hatchery.SFCQ")
pac_Wild.SFCQ <- polarized_allele_counts %>%
  filter(int == "Wild.SFCQ")

nfH_heat <- pheatmap::pheatmap(pac_Hatchery.NFCQ[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "North Fork Hatchery Major Allele Count")

nfW_heat <- pheatmap::pheatmap(pac_Wild.NFCQ[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "North Fork Wild Major Allele Count")

sfH_heat <- pheatmap::pheatmap(pac_Hatchery.SFCQ[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "South Fork Hatchery Major Allele Count")

sfW_heat <- pheatmap::pheatmap(pac_Wild.SFCQ[,-c(13,14)], cluster_cols  = FALSE, show_rownames = FALSE, main = "South Fork Wild Major Allele Count")

Regardless of how the data is structured, at run timing markers there appear to be two major clusters. These clusters were driven by the first three SNPs in the genomic region (Chr28_11607954, Omy_RAD52456-17 and Omy_GREB1_05). While all SNPs annotated as run-timing marker SNPs have demonstrated an association with this phenotype in some populations, in the nearby Rogue River, we have demonstrated that two of these SNPs are not diagnostic of run timing. The third (Omy_RAD52456-17) was not analyzed in that study. This suggests that genomic variation that generates these two clusters is unlikely to have a large phenotype effect. Similarly, there is high genetic diversity (heterozygosity) at a marker at the 3’ border of this region (Chr28_11773194), but this marker is not diagnostic of run timing in the Rogue River. This also suggests that the run timing marker with large differences in allele frequency between hatchery and wild stocks Omy_GREB1_05 is unlikely to have a major phenotypic impact.

Ignoring the first three and last marker, we also observed some genetic variation at other markers known to have a strong association with run-timing in the nearby Rogue River. Some individuals demonstrate a large number of minor alleles compared to our expectations given that all individuals demonstrate winter phenotypes. In both the NF and SF Coquille, a few individuals were heterozygous at many adult run timing markers. This result suggests that these individuals have one copy of the “early” run allele. In the NF Coquille, one individual was homozygous at one run timing marker suggesting that this individual has two copies of the “early” run allele. While we can not predict the phenotypic effect of this variation, it suggests Coquille winter run steelhead harbor some early-migration associated genetic variants.

Below we gather the metadata for individuals with many copies of proposed “early” alleles.

#rowSums(pac_nf[,4:12]) < 15
#pac_nf[rowSums(pac_nf[,4:12], na.rm = TRUE) < 15,14]

min_allele <- pac_nf %>%
  bind_rows(pac_sf) %>%
  select(-c(pop,"Chr28_11607954",  "Omy_RAD52458-17", "Omy_GREB1_05", "Chr28_11773194" )) %>% 
  rowwise(sample) %>%
  mutate(min_allele_ct = sum(c_across(cols = everything())=="1", na.rm = TRUE) + 2*sum(c_across(cols = everything())=="0", na.rm = TRUE)) %>%
  select(sample, min_allele_ct) %>%
  filter(min_allele_ct >= 3) %>%
  left_join(meta_data, by = c("sample" = "SFGL Id"))

#ggplot(data = min_allele)+geom_histogram(aes(x = min_allele_ct))

kable(min_allele, caption = "metadata from samples with more than 3 minor alleles")
metadata from samples with more than 3 minor alleles
sample min_allele_ct Date Vial # Hat/Wild Adult/Juv Sex Est. Age Pedigree pop
OmyJC16NFCQ_0014 4 2016-07-11 44 StW 014 Wild Juvenile NA YoY OmyJC16NFCQ NFCQ
OmyJC16NFCQ_0019 6 2016-07-13 44 StW 019 Wild Juvenile NA 1+ OmyJC16NFCQ NFCQ
OmyJC16NFCQ_0045 3 2016-03-30 44 StW 045 Hatchery Juvenile NA 1 OmyJC16NFCQ NFCQ
OmyAC16SFCQ_0002 6 2016-02-17 144 StW 002 Hatchery Adult F NA OmyAC16SFCQ SFCQ
OmyJC16SFCQ_0029 3 2016-03-30 144 StW 029 Hatchery Juvenile NA 1 OmyJC16SFCQ SFCQ
OmyJC16SFCQ_0031 3 2016-07-12 144 StW 031 Wild Juvenile NA 1+ OmyJC16SFCQ SFCQ

Summary

Differentiation We examined genetic variation at 347 genetic markers among 34 North Fork and 37 South Fork Coquille River Steelhead. Overall FST was low (0.0083), maximum pairwise FST was ~0.016, and PCA did not reveal any clear structure to the data. Using a discriminant analysis of principal components to discriminate betweenNorth or South Fork samples ignoring hatchery/wild status, we observed subtle structure driven by a mix of neutral and putatively adaptive markers. The genetic structure observed at these markers was insufficient to fully discriminate between the groups, however cross validation suggests that individuals could be successfully assigned to their source population using genetic information with 76% accuracy.

Using DAPC to discriminate among all four possible groupings (North Fork Hatchery, North Fork Wild, South Fork Hatchery, South Fork Wild), we were able to describe an axis of genetic variation that strongly discriminated between North Fork Hatchery stocks and all others, and additional axes that revealed subtler structure. Cross validation of this DAPC demonstrated that we could assign individuals back to their source group with 61% success, substantially greater than expected by chance alone (25%). Small sample size, particularly in our sample of the North Fork hatchery stock, may have led to high variance in our some of our estimates, however, our cross validation result should be robust to this effects.

These results suggest there are no strong genetic differences between our North and South Fork samples, or between hatchery stocks and wild populations within and across both forks, but subtle structure in the data (overall FST of 0.083) was sufficient to discriminate between North Fork Coquille hatchery stock individuals and all others.

Importantly, our GTseq panel only provides a small snapshot of the genetic variation within and among these groups and other potentially ecologically relevant genetic differences between these groups may exist. Also sample size

Run timing markers There was limited variation among run-timing associated markers within or between our samples. Four markers showed higher genetic diversity than the others. These markers flank the genomic region associated with run-timing and three of the four have been observed to have low correlation with run-timing phenotypes in the nearby Rogue River.
We also observed some genetic variation at other markers in the genomic region known to have an association with run-timing in the nearby Rogue River. A small number individuals carry many minor, early-migration associated alleles compared to our expectations given that all individuals demonstrate late-migration phenotypes. This suggests early-run alleles may be found among Coquille River winter steelhead, but the phenotypic impact of these alleles is not known.